%pylab inline
Populating the interactive namespace from numpy and matplotlib
Common data formats:
# Using FRED requires a personal key
# To avoid exposing my key, I have it in a separate file
from keys import fred_key
series_id = 'GNPCA'
request_url = 'http://api.stlouisfed.org/fred/series?series_id=' + series_id + '&api_key=' + fred_key + '&file_type=json'
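As a side note (a sketch, not part of the original request): urllib.urlencode builds the query string and escapes parameter values automatically, which is safer than string concatenation:
import urllib
params = urllib.urlencode({'series_id': series_id, 'api_key': fred_key, 'file_type': 'json'})
request_url = 'http://api.stlouisfed.org/fred/series?' + params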
import urllib2
f = urllib2.urlopen(request_url)
data = f.read()
data
'{"realtime_start":"2016-04-07","realtime_end":"2016-04-07","seriess":[{"id":"GNPCA","realtime_start":"2016-04-07","realtime_end":"2016-04-07","title":"Real Gross National Product","observation_start":"1929-01-01","observation_end":"2015-01-01","frequency":"Annual","frequency_short":"A","units":"Billions of Chained 2009 Dollars","units_short":"Bil. of Chn. 2009 $","seasonal_adjustment":"Not Seasonally Adjusted","seasonal_adjustment_short":"NSA","last_updated":"2016-03-25 07:56:03-05","popularity":35,"notes":"BEA Account Code: A001RX1"}]}'
import json
json_data = json.loads(data)
type(json_data)
dict
json_data.keys()
[u'seriess', u'realtime_start', u'realtime_end']
json_data[u'realtime_start'], json_data[u'realtime_end']
(u'2016-04-07', u'2016-04-07')
json_data[u'seriess']
[{u'frequency': u'Annual', u'frequency_short': u'A', u'id': u'GNPCA', u'last_updated': u'2016-03-25 07:56:03-05', u'notes': u'BEA Account Code: A001RX1', u'observation_end': u'2015-01-01', u'observation_start': u'1929-01-01', u'popularity': 35, u'realtime_end': u'2016-04-07', u'realtime_start': u'2016-04-07', u'seasonal_adjustment': u'Not Seasonally Adjusted', u'seasonal_adjustment_short': u'NSA', u'title': u'Real Gross National Product', u'units': u'Billions of Chained 2009 Dollars', u'units_short': u'Bil. of Chn. 2009 $'}]
Other series: https://research.stlouisfed.org/fred2/tags/series
API docs:
https://research.stlouisfed.org/docs/api/fred/
Now let's get some data:
request_url = 'http://api.stlouisfed.org/fred/series/observations?series_id=' + series_id + '&api_key=' + fred_key + '&file_type=json'
import urllib2
f = urllib2.urlopen(request_url)
data = f.read()
json_data = json.loads(data)
json_data.keys()
[u'count', u'order_by', u'observation_start', u'file_type', u'observation_end', u'realtime_end', u'sort_order', u'limit', u'observations', u'offset', u'units', u'output_type', u'realtime_start']
json_data[u'count'], json_data[u'order_by'], json_data[u'observation_start'], json_data[u'file_type'], json_data[u'observation_end']
(87, u'observation_date', u'1776-07-04', u'json', u'9999-12-31')
json_data[u'realtime_start'], json_data[u'realtime_end'], json_data[u'sort_order'], json_data[u'limit'], json_data[u'offset']
(u'2016-04-07', u'2016-04-07', u'asc', 100000, 0)
json_data[u'output_type'], json_data[u'units']
(1, u'lin')
values = []
for o in json_data['observations']:
    # note: FRED encodes missing observations as '.', which would make float() raise here
    values.append(float(o['value']))
We should use something like the pandas module to ensure the data is consistent (e.g. that the time series is equally spaced and ordered, and that there are no missing time values); see the pandas sketch below the plot.
plot(values)
[<matplotlib.lines.Line2D at 0x7f72f0036ad0>]
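A minimal sketch of that idea, assuming pandas is installed ('date' and 'value' are the field names in the FRED observations JSON):
import pandas as pd
df = pd.DataFrame(json_data['observations'])
df['date'] = pd.to_datetime(df['date'])
df['value'] = pd.to_numeric(df['value'], errors='coerce')  # FRED's '.' (missing) becomes NaN
ts = df.set_index('date')['value'].sort_index()
ts.plot()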
f = open('../shared/gnpca.txt', 'w')
for v in values:
    f.write(str(v) + '\n')
f.close()
Other sets:
series_id = 'UNRATE'
request_url = 'http://api.stlouisfed.org/fred/series/observations?series_id=' + series_id + '&api_key=' + fred_key + '&file_type=json'
f = urllib2.urlopen(request_url)
data = f.read()
json_data = json.loads(data)
values = []
for o in json_data['observations']:
    values.append(float(o['value']))
plot(values)
[<matplotlib.lines.Line2D at 0x7f72e9dabc10>]
f = open('../shared/unrate.txt', 'w')
for v in values:
    f.write(str(v) + '\n')
f.close()
series_id = 'GS10'
request_url = 'http://api.stlouisfed.org/fred/series/observations?series_id=' + series_id + '&api_key=' + fred_key + '&file_type=json'
f = urllib2.urlopen(request_url)
data = f.read()
json_data = json.loads(data)
values = []
for o in json_data['observations']:
    values.append(float(o['value']))
plot(values)
[<matplotlib.lines.Line2D at 0x7f72e9ce65d0>]
f = open('../shared/gs10.txt', 'w')
for v in values:
    f.write(str(v) + '\n')
f.close()
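Incidentally, since %pylab pulls in numpy, each of these write loops can be replaced with a single savetxt call (a sketch; savetxt's default format is '%.18e', so the text differs slightly from str(v)):
savetxt('../shared/gs10.txt', values)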
request_url = 'http://ichart.finance.yahoo.com/table.csv?s=^GSPC&ignore=.csv'
f = urllib2.urlopen(request_url)
data = f.read()
import csv
parsed_csv = csv.reader(data.split('\n'))
type(parsed_csv)
_csv.reader
parsed_csv.next()
['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
parsed_csv.next()
['2016-04-06', '2045.560059', '2067.330078', '2043.089966', '2066.659912', '3750800000', '2066.659912']
parsed_csv.next()
['2016-04-05', '2062.50', '2062.50', '2042.560059', '2045.170044', '4154920000', '2045.170044']
highs = []
for row in parsed_csv:
    if len(row) > 0:
        highs.append(float(row[2]))
plot(highs)
[<matplotlib.lines.Line2D at 0x7f72f0a464d0>]
plot(highs[::-1])
[<matplotlib.lines.Line2D at 0x7f72f0bf08d0>]
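Indexing columns by position (row[2]) breaks if the column order ever changes. A sketch of the same extraction with csv.DictReader, which uses the header row to give access by column name:
parsed_csv = csv.DictReader(data.split('\n'))
highs = [float(row['High']) for row in parsed_csv]  # DictReader skips blank rows
plot(highs[::-1])  # reverse so time runs left to right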
A long list of datasets: http://www.models.life.ku.dk/datasets
import urllib
urllib.urlretrieve('http://www.eigenvector.com/data/Corn/corn.mat', 'corn.mat')
('corn.mat', <httplib.HTTPMessage instance at 0x7f72f30060e0>)
from scipy.io import loadmat
corn = loadmat('corn.mat')
type(corn)
dict
corn.keys()
['mp6nbs', 'information', 'propvals', '__globals__', 'm5nbs', 'mp6spec', 'm5spec', 'mp5spec', '__header__', '__version__', 'mp5nbs']
corn['mp6spec'][0][0][7]
array([[-0.0227014, -0.0228025, -0.0228795, ..., 0.675079 , 0.674679 , 0.674056 ], [-0.0219211, -0.0220554, -0.0221607, ..., 0.682942 , 0.682648 , 0.682164 ], [-0.0208596, -0.0209931, -0.0211072, ..., 0.652276 , 0.651984 , 0.651517 ], ..., [-0.0178645, -0.0179813, -0.0180715, ..., 0.695484 , 0.695075 , 0.694381 ], [-0.0067957, -0.0068881, -0.0069559, ..., 0.690173 , 0.689855 , 0.689125 ], [-0.0152611, -0.0153799, -0.0154608, ..., 0.703188 , 0.70277 , 0.702071 ]])
plot(corn['mp6spec'][0][0][7][0,:])
[<matplotlib.lines.Line2D at 0x7f72f08f7190>]
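The chain of indices above is opaque: loadmat returns each MATLAB struct as a 1x1 structured array, so corn['mp6spec'][0][0] is the struct itself and [7] selects its eighth field by position. The field names can be looked up instead (the names depend on the file, so inspect them first):
mp6 = corn['mp6spec'][0][0]
mp6.dtype.names  # the struct's field names, in order
mp6[mp6.dtype.names[7]]  # same array as corn['mp6spec'][0][0][7]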
Various ways to open Excel sheets in Python:
http://stackoverflow.com/questions/3239207/how-can-i-open-an-excel-file-in-python
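For instance, the pandas route from that thread (a sketch; 'data.xls' is a hypothetical file name, and reading .xls files requires the xlrd package):
import pandas as pd
df = pd.read_excel('data.xls')  # reads the first sheet by default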
Data can be encoded "raw" in a file, i.e. the file just contains bytes that are copied directly into memory. The user must know the format and data type of the data.
fib = [1,1,2,3,5,8,13]
import struct
enc = struct.pack('i', 10)
enc
'\n\x00\x00\x00'
hex(10)
'0xa'
chr(10)
'\n'
len(enc)
4
ord('\n'), ord('a')
(10, 97)
'i'*len(fib)
'iiiiiii'
bytes = struct.pack('i'*len(fib), *fib)
bytes
'\x01\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x05\x00\x00\x00\x08\x00\x00\x00\r\x00\x00\x00'
fib = struct.unpack('iiiiiii', bytes)
fib
(1, 1, 2, 3, 5, 8, 13)
fib_f = struct.unpack('fffffff', bytes)
fib_f
(1.401298464324817e-45, 1.401298464324817e-45, 2.802596928649634e-45, 4.203895392974451e-45, 7.006492321624085e-45, 1.1210387714598537e-44, 1.8216880036222622e-44)
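The integer bit patterns, reinterpreted as 32-bit floats, come out as meaningless denormal values; this is why the reader must know the intended data type. Packing as floats in the first place round-trips correctly: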
bytes = struct.pack('f'*len(fib), *fib)
bytes
'\x00\x00\x80?\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\xa0@\x00\x00\x00A\x00\x00PA'
fib_f = struct.unpack('fffffff', bytes)
fib_f
(1.0, 1.0, 2.0, 3.0, 5.0, 8.0, 13.0)
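The same idea applies to whole files. A sketch using numpy's headerless file I/O (array and fromfile are already in the namespace via %pylab; 'fib.raw' is a hypothetical file name):
a = array(fib, dtype='float32')
a.tofile('fib.raw')  # writes just the raw bytes, no header or metadata
fromfile('fib.raw', dtype='float32')  # the reader must supply the matching dtype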
[Earthquake XML.ipynb](Earthquake XML.ipynb)
Common format for "big data". Specifically designed for large multi-dimensional data sets.
Several options depending on need:
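Assuming the format meant here is HDF5 (which matches this description), the usual Python options are h5py and PyTables. A minimal h5py sketch, reusing the corn spectra from above and a hypothetical file name:
import h5py
hf = h5py.File('corn.h5', 'w')
hf.create_dataset('spectra', data=corn['mp6spec'][0][0][7])
hf.close()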
By: Andrés Cabrera mantaraya36@gmail.com
For Course MAT 240F at UCSB
This ipython notebook is licensed under the CC-BY-NC-SA license: http://creativecommons.org/licenses/by-nc-sa/4.0/